# Load the per-user activity table; the rest of this script reads it under the
# name `user_activity_univ_only_timeframe`.
load("02_11_m5s_user_activity_univ_only_timeframe.RData")

# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, postponing the failure to the first ggplot() call.
library(ggplot2)
library(gridExtra)

# Logical flag: TRUE where the recorded gender is "male".
user_activity_univ_only_timeframe$male <-
  user_activity_univ_only_timeframe$gender == "male"

# One logical flag per platform: TRUE when the user's value in that platform's
# column is positive.
user_activity_univ_only_timeframe$fb_bool <-
  user_activity_univ_only_timeframe$facebook > 0
user_activity_univ_only_timeframe$mu_bool <-
  user_activity_univ_only_timeframe$meetup > 0
user_activity_univ_only_timeframe$bg_bool <-
  user_activity_univ_only_timeframe$blog > 0
user_activity_univ_only_timeframe$fr_bool <-
  user_activity_univ_only_timeframe$forum > 0

# Number of platforms (0 to 4) on which the user is flagged as active.
user_activity_univ_only_timeframe$n_platf <- with(
  user_activity_univ_only_timeframe,
  fb_bool + mu_bool + bg_bool + fr_bool
)

# Mean and median of `timediff` over all users. The plots in this script divide
# timediff by 3600 * 24 and label the result "Days", so it is presumably stored
# in seconds -- TODO confirm.
user_activity_univ_only_timeframe_mean <-
  mean(user_activity_univ_only_timeframe$timediff)
user_activity_univ_only_timeframe_median <-
  median(user_activity_univ_only_timeframe$timediff)

# Overlap between Meetup activity and the other platforms.
# The bare ratio expressions are kept so each value auto-prints when the
# script is run interactively.

# Users flagged as active on Meetup (denominator for the Meetup shares below).
meetup_users <- sum(user_activity_univ_only_timeframe$mu_bool)

# Meetup users who are also flagged on at least one other platform.
meetu_users_active_outside <- with(
  user_activity_univ_only_timeframe,
  sum(mu_bool & (fb_bool | fr_bool | bg_bool))
)
meetu_users_active_outside / meetup_users

# Meetup users who are also flagged on Facebook.
meetu_users_active_fb <- with(
  user_activity_univ_only_timeframe,
  sum(mu_bool & fb_bool)
)
meetu_users_active_fb / meetup_users

# Meetup users with at least one comment or post who are NOT flagged on
# Facebook.
meetu_users_posting <- with(
  user_activity_univ_only_timeframe,
  sum(mu_bool & (comment > 0 | post > 0) & !fb_bool)
)
meetu_users_posting / meetup_users

# All users with at least one comment or post, regardless of platform.
posting_users <- with(
  user_activity_univ_only_timeframe,
  sum(comment > 0 | post > 0)
)
meetu_users_posting / posting_users

# Share of forum-flagged users who are also flagged on Meetup.
posting_users_on_fr <- sum(user_activity_univ_only_timeframe$fr_bool)
meetu_users_posting_on_fr <- with(
  user_activity_univ_only_timeframe,
  sum(fr_bool & mu_bool)
)
meetu_users_posting_on_fr / posting_users_on_fr
# [1] 2.30466
# NOTE(review): the ratio above is a share and cannot exceed 1, so this
# recorded value presumably comes from an earlier version of the expression
# (or was scaled to a percentage) -- re-run to confirm.

# Raw count of users flagged on both Meetup and the forum.
with(user_activity_univ_only_timeframe, sum(mu_bool & fr_bool))

# Density of timediff / (3600 * 24) over all users; the x-axis label reports
# the number of rows in the table.
p1 <- ggplot(
  data = user_activity_univ_only_timeframe,
  mapping = aes(x = timediff / (3600 * 24))
) +
  geom_density() +
  # geom_vline(xintercept = user_activity_univ_only_timeframe_mean/(3600*24), alpha=0.4, colour="red") +
  # geom_vline(xintercept = user_activity_univ_only_timeframe_median/(3600*24), alpha=0.4, colour="blue") +
  xlab(sprintf(
    "Days (n=%s)",
    formatC(nrow(user_activity_univ_only_timeframe), format = "d", big.mark = ",")
  )) +
  scale_x_continuous(breaks = seq(from = 0, to = 3000, by = 500))

# Overall sample size and gender split.
# NOTE: `[1:2]` keeps the first two entries of the gender table; the recorded
# output below shows these to be "female" and "male" for this data set.
all_timeframe_stats <- list(
  n = nrow(user_activity_univ_only_timeframe),
  # 2410422
  gender_tbl = prop.table(table(user_activity_univ_only_timeframe$gender)[1:2])
  # female      male
  # 0.3783071 0.6216929
)

# Per-platform sample size and gender split, stored as "<platform>_n" and
# "<platform>_gender".
timeframe_stats <- list()
for (plt in c("facebook", "meetup", "forum", "blog")) {
  # Keep every user with a positive value for the platform. The previous test
  # `get(plt) == TRUE` coerces TRUE to 1, so on count-valued columns it kept
  # only users whose value was exactly 1. which() drops NA rows, matching the
  # NA handling of subset().
  tmp <- user_activity_univ_only_timeframe[
    which(user_activity_univ_only_timeframe[[plt]] > 0), ,
    drop = FALSE
  ]
  timeframe_stats[[paste0(plt, "_n")]] <- nrow(tmp)
  timeframe_stats[[paste0(plt, "_gender")]] <-
    prop.table(table(tmp$gender)[1:2])
}

# sequence <- c(0,5,10,25,50,100,250,500,750,1000,2000,3000)
# gender_gap_by_activity <- data.frame()
# for (i in 1:length(sequence)) {
#   if (sequence[i] == 0) {
#     tmp <- subset(user_activity_univ_only_timeframe, timediff == sequence[i] * (3600 * 24))
#     tmp_fb <- subset(user_activity_univ_only_timeframe, timediff == sequence[i] * (3600 * 24) & 
#                        facebook==TRUE)
#     tmp_mu <- subset(user_activity_univ_only_timeframe, timediff == sequence[i] * (3600 * 24) & 
#                        meetup==TRUE)
#     tmp_bg <- subset(user_activity_univ_only_timeframe, timediff == sequence[i] * (3600 * 24) & 
#                        blog==TRUE)
#     tmp_fr <- subset(user_activity_univ_only_timeframe, timediff == sequence[i] * (3600 * 24) & 
#                        forum==TRUE)
#   } else {
#     tmp <- subset(user_activity_univ_only_timeframe, timediff <= sequence[i] * (3600 * 24) &
#                     timediff > sequence[i-1] * (3600 * 24))
#     tmp_fb <- subset(user_activity_univ_only_timeframe, timediff <= sequence[i] * (3600 * 24) & 
#                        facebook==TRUE &
#                        timediff > sequence[i-1] * (3600 * 24))
#     tmp_mu <- subset(user_activity_univ_only_timeframe, timediff <= sequence[i] * (3600 * 24) & 
#                        meetup==TRUE &
#                        timediff > sequence[i-1] * (3600 * 24))
#     tmp_bg <- subset(user_activity_univ_only_timeframe, timediff <= sequence[i] * (3600 * 24) & 
#                        blog==TRUE &
#                        timediff > sequence[i-1] * (3600 * 24))
#     tmp_fr <- subset(user_activity_univ_only_timeframe, timediff <= sequence[i] * (3600 * 24) & 
#                        forum==TRUE &
#                        timediff > sequence[i-1] * (3600 * 24))
#   }
#   gender_gap_by_activity <- rbind(gender_gap_by_activity,
#                                   data.frame(
#                                     seq = sequence[i],
#                                     male_tot = prop.table(table(tmp$gender)[1:2])[2],
#                                     female_tot = prop.table(table(tmp$gender)[1:2])[1],
#                                     n_tot = nrow(tmp) / all_timeframe_stats[['n']],
#                                     male_facebook = prop.table(table(tmp_fb$gender)[1:2])[2],
#                                     female_facebook = prop.table(table(tmp_fb$gender)[1:2])[1],
#                                     n_facebook = nrow(tmp_fb) / timeframe_stats[['facebook_n']],
#                                     male_forum = prop.table(table(tmp_fr$gender)[1:2])[2],
#                                     female_forum = prop.table(table(tmp_fr$gender)[1:2])[1],
#                                     n_forum = nrow(tmp_fr) / timeframe_stats[['forum_n']],
#                                     male_meetup = prop.table(table(tmp_mu$gender)[1:2])[2],
#                                     female_meetup = prop.table(table(tmp_mu$gender)[1:2])[1],
#                                     n_meetup = nrow(tmp_mu) / timeframe_stats[['meetup_n']],
#                                     male_blog = prop.table(table(tmp_bg$gender)[1:2])[2],
#                                     female_blog = prop.table(table(tmp_bg$gender)[1:2])[1],
#                                     n_blog = nrow(tmp_bg) / timeframe_stats[['blog_n']]))  
# }
# require(reshape2)
# require(directlabels)
# gender_gap_by_activity$seq <- factor(gender_gap_by_activity$seq, levels=sequence)

# p3 <- ggplot(subset(melt(gender_gap_by_activity, id="seq"), grepl('female', variable)), 
#        aes(x=seq, y=value, group=variable, colour=variable)) +
#   geom_line() +
#   geom_dl(aes(label=as.factor(variable)),
#           method="top.bumpup") +
#   guides(colour=FALSE) + 
#   scale_y_continuous(labels=scales::percent)  +
#   labs(x=NULL, y=NULL)
# 
# p4 <- ggplot(subset(melt(gender_gap_by_activity, id="seq"), !(grepl('male', variable))), 
#        aes(x=seq, y=value, group=variable, colour=variable)) +
#   geom_line() +
#   geom_dl(aes(label=as.factor(variable)),
#           method="top.bumpup") +
#   guides(colour=FALSE) + 
#   scale_y_continuous(labels=scales::percent) +
#   labs(x="days (brackets)", y=NULL)

# Split users by whether timediff is exactly zero ("one touch") or strictly
# positive ("multi touch"), and report the gender split of each group.

# Number of users whose timediff is exactly zero.
one_touch <- with(user_activity_univ_only_timeframe, sum(timediff == 0))

# Rows with timediff == 0; which() drops NA comparisons, as subset() does.
one_touch_df <- user_activity_univ_only_timeframe[
  which(user_activity_univ_only_timeframe$timediff == 0), ,
  drop = FALSE
]
prop.table(table(one_touch_df$gender)[1:2])
# female      male 
# 0.4292021 0.5707979  

# Rows with timediff > 0.
multi_touch_df <- user_activity_univ_only_timeframe[
  which(user_activity_univ_only_timeframe$timediff > 0), ,
  drop = FALSE
]
prop.table(table(multi_touch_df$gender)[1:2])
# female      male 
# 0.3605214 0.6394786 

# Mean and median timediff among multi-touch users.
multi_touch_mean <- mean(multi_touch_df$timediff)
multi_touch_median <- median(multi_touch_df$timediff)
# Density of timediff / (3600 * 24) for multi-touch users, one curve per
# gender. The "#female" / "#male" strings are constant aesthetic keys that
# scale_colour_manual() maps to the actual colours and legend labels.
p2 <- ggplot() + # geom_density(data=multi_touch_df, aes(timediff/(3600*24))) +
  geom_density(
    data = subset(multi_touch_df, gender == "female"),
    aes(timediff / (3600 * 24), colour = "#female")
  ) +
  geom_density(
    data = subset(multi_touch_df, gender == "male"),
    aes(timediff / (3600 * 24), colour = "#male")
  ) +
  # geom_vline(xintercept = multi_touch_mean/(3600*24), alpha=0.4, colour="red") +
  # geom_vline(xintercept = multi_touch_median/(3600*24), alpha=0.4, colour="blue") +
  # n = number of plotted users. Count on the gender column only: comparing the
  # whole data frame to "male"/"female" scans every column and would also count
  # any matching cell outside `gender`. %in% never yields NA, so no na.rm is
  # needed.
  labs(x = paste0(
    "Days (n=",
    formatC(
      sum(multi_touch_df$gender %in% c("male", "female")),
      format = "d", big.mark = ","
    ),
    ")"
  )) +
  scale_colour_manual(
    name = "",
    values = c("#female" = "#e41a1c", "#male" = "#377eb8"),
    labels = c("female", "male")
  ) +
  theme(legend.position = "bottom")

# Density of timediff / (3600 * 24) for multi-touch users, one curve per
# platform flag. A user flagged on several platforms contributes to each of
# the corresponding curves. The "#..." strings are constant aesthetic keys
# that scale_colour_manual() maps to colours and legend labels.
p3 <- ggplot() + # geom_density(alpha=.1, data=multi_touch_df, aes(timediff/(3600*24))) +
  geom_density(
    mapping = aes(x = timediff / (3600 * 24), colour = "#blog"),
    data = subset(multi_touch_df, bg_bool == TRUE)
  ) +
  geom_density(
    mapping = aes(x = timediff / (3600 * 24), colour = "#facebook"),
    data = subset(multi_touch_df, fb_bool == TRUE)
  ) +
  geom_density(
    mapping = aes(x = timediff / (3600 * 24), colour = "#forum"),
    data = subset(multi_touch_df, fr_bool == TRUE)
  ) +
  geom_density(
    mapping = aes(x = timediff / (3600 * 24), colour = "#meetup"),
    data = subset(multi_touch_df, mu_bool == TRUE)
  ) +
  # geom_vline(xintercept = multi_touch_mean/(3600*24), alpha=0.4, colour="red") +
  # geom_vline(xintercept = multi_touch_median/(3600*24), alpha=0.4, colour="blue") +
  xlab("Days (only users with at least 2 occurrences)") +
  scale_x_continuous(breaks = seq(from = 0, to = 3000, by = 500)) +
  scale_colour_manual(
    name = "",
    values = c(
      "#blog" = "#e41a1c",
      "#facebook" = "#377eb8",
      "#forum" = "#4daf4a",
      "#meetup" = "#ff7f00"
    ),
    labels = c("Blog", "Facebook", "Forum", "Meetup")
  ) +
  theme(legend.position = "bottom")

# ggplot(multi_touch_df, aes(timediff)) + geom_density() + scale_x_log10()


# Regressions

# glm1 <- glm(timediff ~ male, data=user_activity_univ_only_timeframe, family="poisson")
# glm2 <- glm(timediff ~ male + mu_bool, data=user_activity_univ_only_timeframe, family="poisson")
# glm3 <- glm(timediff ~ male + mu_bool + fb_bool, data=user_activity_univ_only_timeframe, family="poisson")
# glm4 <- glm(timediff ~ male + mu_bool + fb_bool + bg_bool, data=user_activity_univ_only_timeframe, family="poisson")
# glm5 <- glm(timediff ~ male + mu_bool + fb_bool + bg_bool + fr_bool, data=user_activity_univ_only_timeframe, family="poisson")
# glm6 <- glm(timediff ~ male + n_platf, data=user_activity_univ_only_timeframe, family="poisson")

# 
# lm1 <- lm(log10(timediff) ~ male + facebook + meetup  + blog, data=multi_touch_df)
# summary(lm1)$r.squared
